Refined Cleaning Methods

Session Isolation

# Keep only the sessions whose name marks them as the race itself.
race_sessions <- session_data %>%
  dplyr::filter(session_name == "Race")
# Precautionary: drop repeated session keys, keeping the first occurrence.
# With the data in its current state this removes nothing.
race_sessions <- race_sessions[!duplicated(race_sessions$session_key), ]

Driver Data Cleanup

Cleaning up the driver data into a usable state prior to use

Merging RB and Racing Bulls

# Relabel every "RB" entry as "Racing Bulls" so both spellings end up under a
# single team name; all other team names are left untouched.
driver_names <- mutate(
  driver_names,
  team_name = ifelse(team_name == "RB", "Racing Bulls", team_name)
)

Fixing Color Codes

# Turn the raw colour codes into "#"-prefixed colour strings and give every
# driver of a team the same colour.
#
# * Missing colours stay NA. Prefixing them unconditionally would create the
#   literal string "#NA", which is not a valid colour.
# * Values that already start with "#" are left alone, so re-running this
#   chunk does not produce "##...".
# * Each team takes its first non-missing colour; a team with no colour at
#   all keeps NA.
driver_names <- driver_names %>%
  mutate(
    team_colour = as.character(team_colour),
    team_colour = ifelse(
      is.na(team_colour) | startsWith(team_colour, "#"),
      team_colour,
      paste0("#", team_colour)
    )
  ) %>%
  group_by(team_name) %>%
  mutate(team_colour = first(team_colour[!is.na(team_colour)])) %>%
  ungroup()

Filtering to Race Drivers Only

# Keep only the driver rows that belong to one of the race sessions.
race_drivers_all <- driver_names %>%
  dplyr::filter(session_key %in% race_sessions$session_key)

Adding Driver Info

# Slimmed-down driver table: the meeting key and the long-form name columns
# are dropped, every other column is kept.
limited_drivers <- dplyr::select(
  race_drivers_all,
  -c(meeting_key, broadcast_name, first_name, last_name)
)

Pitstop Data Cleaning

In this section I clean pitstop_data$pit_duration by removing NA values and outliers.

Filtering Down to Race Pitstops

# Restrict the pit stop records to those recorded during race sessions.
pitstop_data <- dplyr::filter(
  pitstop_data,
  session_key %in% race_sessions$session_key
)

Joining Driver Information

# Attach the driver columns to each pit stop, matching on session and driver
# number; pit stops without a matching driver row are kept.
pitstop_data <- left_join(
  pitstop_data,
  limited_drivers,
  by = c("session_key", "driver_number")
)
# Discard pit stops that have no recorded duration.
pitstop_data <- dplyr::filter(pitstop_data, !is.na(pit_duration))

Outlier Determination

  • Pit duration is tracked from the start of the pit lane to the end, regardless of conditions. Under a safety car, all cars are in the pits for long periods of time; the same is true for a DNF.
  • Due to this, I intend to remove all pitstops which exceed 150 seconds first, after which point I will remove the outliers of the remaining set.
# Distribution before trimming: the maximum sits far above the quartiles.
summary(pitstop_data$pit_duration)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    12.8    21.8    23.4   103.0    26.8  2485.9

# Upper bound, in seconds, above which a pit stop is discarded.
pit_duration_cutoff <- 150

# dplyr::filter() drops rows whose condition evaluates to NA. Base subsetting
# with `df[cond, ]` would instead insert an all-NA row for every missing
# duration, so this step no longer depends on NAs having been removed earlier.
pitstop_data <- pitstop_data %>%
  dplyr::filter(pit_duration <= pit_duration_cutoff)

# Quartiles, IQR and median of the trimmed durations.
pitstop_duration_Q1 <- quantile(pitstop_data$pit_duration, 0.25, na.rm = TRUE)
pitstop_duration_Q3 <- quantile(pitstop_data$pit_duration, 0.75, na.rm = TRUE)
# Equivalent to IQR(pitstop_data$pit_duration, na.rm = TRUE).
pitstop_iqr <- pitstop_duration_Q3 - pitstop_duration_Q1
pitstop_median <- median(pitstop_data$pit_duration, na.rm = TRUE)

# Distribution after applying the cutoff.
summary(pitstop_data$pit_duration)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.80   21.70   23.30   24.28   25.30   93.20

# 1.5 * IQR fence filter; currently disabled, so only the cutoff above is
# applied to the data.
# pitstop_data <- pitstop_data %>%
#   filter(pit_duration >= (pitstop_duration_Q1 - 1.5 * pitstop_iqr) & 
#          pit_duration <= (pitstop_duration_Q3 + 1.5 * pitstop_iqr))

Pitstop Averages

Plotting the Results

Using the results I have previously gathered, this section contains several plots created from the aggregated data points.

# Horizontal box plot of pit stop duration per team, with teams ordered by
# their median duration and each box filled with the team's own colour.
TeamBoxplot <- ggplot(
  pitstop_data,
  aes(
    x = reorder(team_name, pit_duration, FUN = median),
    y = pit_duration,
    fill = team_colour
  )
) +
  # Outlier markers are hidden because the jitter layer draws every point.
  geom_boxplot(alpha = 0.6, outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.3, colour = "black") +
  # team_colour already holds colour strings, so use them as-is.
  scale_fill_identity() +
  # Limit the duration axis to the 2nd-97th percentile range; oob_keep leaves
  # values outside that range in the data instead of discarding them.
  scale_y_continuous(
    limits = quantile(pitstop_data$pit_duration, c(0.02, 0.97), na.rm = TRUE),
    oob = scales::oob_keep
  ) +
  coord_flip() +
  labs(
    title = "Pit Stop Duration by Team",
    x = "Team",
    y = "Pit Stop Duration (s)"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Render the plot as an interactive plotly widget.
layout(
  ggplotly(TeamBoxplot, width = 1200, height = 700),
  autosize = TRUE
)

Plotting Averages

# Bar chart of the mean pit stop duration per driver, ordered from the lowest
# to the highest mean and filled with each driver's team colour.
AveragesGraph <- ggplot(
  pitstop_average,
  aes(x = reorder(driver_label, Mean), y = Mean, fill = team_colour)
) +
  geom_col() +
  # team_colour already holds colour strings, so use them as-is.
  scale_fill_identity() +
  labs(x = "Name Acronyms", y = "Mean (s)", title = "Averages Plot") +
  # Rotate only the x-axis tick labels (the driver acronyms). Setting the
  # angle on the root `text` element would also tilt the title and both axis
  # titles, because every text element inherits from it.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Render the plot as an interactive plotly widget.
ggplotly(AveragesGraph, width = 1200, height = 700) %>%
  layout(autosize = TRUE)